Show the code
#|
# Total number of OpenAlex works matching any of the R1 search terms:
# one count-only query per term in R1_st against title + abstract,
# then summed across terms.
# NOTE(review): `compact()` is a project helper (presumably cleans the
# search string for OpenAlex) - confirm against its definition.
# NOTE(review): summing per-term counts double-counts works that match
# more than one term (acknowledged later in the report).
R1_count <- sapply(
R1_st,
function(st) {
# count_only = TRUE returns a summary row instead of the works
openalexR::oa_fetch(
title_and_abstract.search = compact(st),
count_only = TRUE,
verbose = TRUE
)[, "count"]
}
) |>
sum()Data Management Report
A short description of what this is about. This is not a traditional abstract, but rather something else …
IPBES_TCA_Ch5_Subsidies_Reform
%The BuildNo is automatically increased by one each time the report is rendered. It is used to indicate different renderings when the version stays the same%.
The search terms are based on the shared google doc. They are cleaned up for the usage in OpenAlex.
R1 count from OpenAlexThe search terms are R1
#|
# Re-computation of R1_count; this chunk repeats the folded code shown
# at the top of the report: one count-only query per R1 term, summed.
# NOTE(review): summing per-term counts double-counts works that match
# more than one term.
R1_count <- sapply(
R1_st,
function(st) {
# count_only = TRUE returns a summary row instead of the works
openalexR::oa_fetch(
title_and_abstract.search = compact(st),
count_only = TRUE,
verbose = TRUE
)[, "count"]
}
) |>
sum()R2 count from OpenAlexThe search terms is R2
#|
# Count of works matching the single R2 search string - one query,
# no summation needed (unlike R1, R2_st is one combined string).
R2_count <- openalexR::oa_fetch(
title_and_abstract.search = compact(R2_st),
count_only = TRUE,
verbose = TRUE
)[, "count"]R1 AND R2 count from OpenAlex#|
# Count of works matching (R1 term) AND (R2): each R1 term is combined
# with the full R2 search string, counted, and the counts summed.
# NOTE(review): works matching several R1 terms are counted once per
# term, so this is an overestimate (stated in the report text).
R1_R2_count <- sapply(
R1_st,
function(st) {
openalexR::oa_fetch(
title_and_abstract.search = compact(paste0("(", st, ") AND (", R2_st, ")")),
count_only = TRUE,
output = "list",
verbose = TRUE
)$count
}
) |>
sum()R1 AND R2 CorpusThe corpus download will be stored in data/pages and the arrow database in data/corpus.
This is not on github!
The corpus can be read by running read_corpus() which opens the database so that then it can be fed into a dplyr pipeline. After most dplyr functions, the actual data needs to be collected via dplyr::collect().
Only then is the actual data read!
Needs to be enabled by setting eval: true in the code block below.
#|
tic()

# Download the OpenAlex corpus for one search string, page-wise, into
# one sub-directory per publication year.
#
# search:    search string for `title_and_abstract.search`, cleaned via
#            the project helper `compact()`.
# pages_dir: directory the per-year page dumps are written to
#            (created if necessary).
# rds_dir:   prefix of the per-year sub-directories; the year is
#            appended, e.g. "pages_publication_year=2020".
#
# Returns, invisibly, the unlisted per-year results; the real output
# are the .rds pages written by IPBES.R::oa_request_IPBES().
get_corpus_pages <- function(search, pages_dir, rds_dir = "pages_publication_year=") {
  dir.create(
    path = pages_dir,
    showWarnings = FALSE,
    recursive = TRUE
  )

  # One group per publication year; $key holds the year values.
  # (openalexR:: namespaced for consistency with the rest of the file.)
  years <- openalexR::oa_fetch(
    title_and_abstract.search = compact(search),
    group_by = "publication_year",
    paging = "cursor",
    verbose = FALSE
  )$key

  #######
  # Resume logic (skip years already completely downloaded),
  # currently disabled:
  #######
  # processed <- list.dirs(
  #   path = pages_dir,
  #   full.names = FALSE,
  #   recursive = FALSE
  # ) |>
  #   gsub(
  #     pattern = paste0("^pages_publication_year=", ""),
  #     replacement = ""
  #   )
  # interrupted <- list.files(
  #   path = pages_dir,
  #   pattern = "^next_page.rds",
  #   full.names = TRUE,
  #   recursive = TRUE
  # ) |>
  #   gsub(
  #     pattern = paste0("^", pages_dir, "/pages_publication_year=", ""),
  #     replacement = ""
  #   ) |>
  #   gsub(
  #     pattern = "/next_page.rds$",
  #     replacement = ""
  #   )
  # completed <- processed[!(processed %in% interrupted)]
  # years <- years[!(years %in% completed)]
  #######
  #######

  # Years are processed in random order (sample()) - presumably to
  # spread large and small years evenly over the workers; confirm.
  result <- pbmcapply::pbmclapply(
    sample(years),
    function(y) {
      message("\nGetting data for year ", y, " ...")

      output_path <- file.path(pages_dir, paste0(rds_dir, y))
      dir.create(
        path = output_path,
        showWarnings = FALSE,
        recursive = TRUE
      )

      # The pages are written to output_path as a side effect of
      # IPBES.R::oa_request_IPBES(); its return value is collected
      # only for completeness.
      openalexR::oa_query(
        title_and_abstract.search = compact(search),
        publication_year = y,
        options = list(
          select = c("id", "doi", "authorships", "publication_year", "display_name", "abstract_inverted_index", "topics")
        ),
        verbose = FALSE
      ) |>
        IPBES.R::oa_request_IPBES(
          count_only = FALSE,
          output_path = output_path,
          verbose = TRUE
        )
    },
    mc.cores = 8,
    mc.preschedule = FALSE
  ) |>
    unlist()

  invisible(result)
}
# Combined searches: each R1 term ANDed with the full R2 search string.
# vapply (rather than sapply) guarantees a character vector even for
# unexpected or zero-length input.
sts <- vapply(
  R1_st,
  function(st) {
    compact(paste0("(", st, ") AND (", R2_st, ")"))
  },
  FUN.VALUE = character(1)
)

# Download both combined corpora into the same pages directory,
# distinguished by the per-year sub-directory prefix.
get_corpus_pages(
  search = sts[[1]],
  pages_dir = file.path(".", "data", "pages"),
  rds_dir = "pages_1_publication_year="
)
get_corpus_pages(
  search = sts[[2]],
  pages_dir = file.path(".", "data", "pages"),
  rds_dir = "pages_2_publication_year="
)
toc()The fields author and topics are serialized in the arrow database and need to be unserialized by using unserialize_arrow() on a dataset containing the two columns.
tic()

# Directories: raw page dumps (input) and the arrow dataset (output).
pages_dir <- file.path(".", "data", "pages")
arrow_dir <- file.path(".", "data", "corpus")

# The years to process, extracted from the per-year page directory
# names of the form "<prefix>publication_year=<year>": the part after
# the first "=" is taken, so the prefix must contain exactly one "=".
years <- list.dirs(
path = pages_dir,
full.names = TRUE,
recursive = FALSE
) |>
strsplit(
split = "="
) |>
sapply(
FUN = function(x) {
x[2]
}
) |>
unique() |>
sort()

# Resume logic (skip years whose arrow partition already exists),
# currently disabled:
# years_done <- list.dirs(
#   path = arrow_dir,
#   full.names = TRUE,
#   recursive = FALSE
# )
# years <- years[
#   (
#     gsub(
#       x = years,
#       pattern = paste0("^", pages_dir, "/pages_publication_year="),
#       replacement = ""
#     ) # %in% gsub(
#     #   x = years_done,
#     #   pattern = paste0("^", arrow_dir, "/publication_year="),
#     #   replacement = ""
#     # )
#   )
# ]
# Convert the per-year page dumps into one tibble per year and write
# it both as a year snapshot (.rds in pages_dir) and as a partition of
# the arrow dataset in arrow_dir.
result <- pbapply::pblapply(
  sample(years),
  function(year) {
    message("\n Processing year ", year, " ...\n")

    # All page files for this year. Anchoring on
    # "publication_year=<year>/" avoids accidental matches of the bare
    # year string elsewhere in the path (the previous pattern matched
    # the year anywhere in the file name).
    pages <- list.files(
      path = pages_dir,
      pattern = "\\.rds$",
      full.names = TRUE,
      recursive = TRUE
    ) |>
      grep(
        pattern = paste0("publication_year=", year, "/"),
        value = TRUE
      )

    data <- parallel::mclapply(
      pages,
      function(page) {
        # message("Processing ", page, " ...")
        p <- readRDS(page)$results
        if (length(p) == 0) {
          # e.g. a next_page.rds cursor file without results
          p <- NULL
        } else {
          p <- openalexR::works2df(p, verbose = FALSE)
          p$author_abbr <- IPBES.R::abbreviate_authors(p)
        }
        return(p)
      },
      mc.cores = 1 # params$mc.cores
    ) |>
      do.call(what = dplyr::bind_rows) |>
      distinct(id, .keep_all = TRUE)

    # Per-year snapshot next to the page directories ...
    saveRDS(
      data,
      file = file.path(pages_dir, paste0(year, ".rds"))
    )

    # ... and the corresponding partition of the arrow dataset
    # (list columns such as author and topics are serialized first).
    data <- serialize_arrow(data)
    arrow::write_dataset(
      data,
      path = arrow_dir,
      partitioning = "publication_year",
      format = "parquet",
      existing_data_behavior = "overwrite"
    )
  }
)
# Sub-corpus of works that are also in the TCA corpus.
# NOTE(review): data/ids_subs_tca.rds is created in a later chunk, so
# the chunks are not runnable strictly top-to-bottom - confirm the
# intended execution order.
arrow_dir <- file.path(".", "data", "corpus_tca")

# collect() before filter() loads the full corpus into memory;
# the id filter is then applied in R, not pushed down to arrow.
read_corpus("data/corpus") |>
dplyr::collect() |>
dplyr::filter(id %in% readRDS("data/ids_subs_tca.rds")) |>
arrow::write_dataset(
path = arrow_dir,
partitioning = "publication_year",
format = "parquet",
existing_data_behavior = "overwrite"
)
toc()#|
# All work ids in the subsidies corpus.
ids_subsidies <- read_corpus(file.path("data", "corpus")) |>
dplyr::select(id) |>
collect() |>
unlist()Warning: Invalid metadata$r
# All work ids in the separately built TCA corpus (sibling repository).
ids_tca <- read_corpus(file.path("..", "IPBES_TCA_Corpus", "data", "corpus")) |>
dplyr::select(id) |>
collect() |>
unlist()Warning: Invalid metadata$r
# Intersection of the two corpora, saved for building corpus_tca.
ids_subs_tca <- ids_tca[ids_tca %in% ids_subsidies]
saveRDS(ids_subs_tca, file = file.path("data", "ids_subs_tca.rds"))#|
# Export id / publication_year / abstract (ab) for sentiment analysis.
# NOTE(review): write.table() defaults to space-separated output with
# row names and quoting - confirm this is what the downstream analysis
# expects; the report text calls these files csv.
read_corpus("data/corpus") |>
dplyr::select(id, publication_year, ab) |>
dplyr::collect() |>
write.table(file = "sent_analysis_subsidies.txt")
# Same export for the TCA sub-corpus.
read_corpus("data/corpus_tca") |>
dplyr::select(id, publication_year, ab) |>
dplyr::collect() |>
write.table(file = "sent_analysis_subsidies_tca.txt")#|
# Two reproducible random samples of 50 works each (fixed seeds),
# exported to xlsx for manual quality checks of the corpora.
# NOTE(review): dplyr::slice_sample() is applied before collect(),
# i.e. on the arrow query - confirm the installed arrow version
# supports sampling on a query; otherwise move it after collect().
set.seed(13)
read_corpus("data/corpus") |>
dplyr::select(id, doi, author_abbr, display_name, ab) |>
dplyr::rename(abstract = ab, title = display_name) |>
dplyr::slice_sample(n = 50) |>
dplyr::collect() |>
writexl::write_xlsx(path = "random_50_subsidies.xlsx")
set.seed(14)
read_corpus("data/corpus_tca") |>
dplyr::select(id, doi, author_abbr, display_name, ab) |>
dplyr::rename(abstract = ab, title = display_name) |>
dplyr::slice_sample(n = 50) |>
dplyr::collect() |>
writexl::write_xlsx(path = "random_50_subsidies_in_tca.xlsx")The number of hits are hits of the terms of the whole of the OpenAlex corpus. Due to methodological issues, the number of R1 AND R2 are overestimates and contain some double counting.
R1 in OpenAlex: 114,188 hitsR2 in OpenAlex: 34,396,332 hitsR1 AND R2 in OpenAlex: 40,884 hitsR1 AND R2 in TCA corpus: 12,095 hitsThe file contains the id, doi, author_abbr and abstract of the papers. Two samples were generated:
Two csv files containing the id, publication_year and `ab` (abstract) were extracted:
The red line is the cumulative proportion of publications, the blue line the cumulative proportion of the whole OpenAlex corpus. Both use the secondary (red) axis.
# Per-year publication counts of the TCA sub-corpus with proportions
# (p) and cumulative proportions (p_cum), joined with the per-year
# counts of the whole OpenAlex corpus. The inner_join has no `by`, so
# it is a natural join on the only shared column, publication_year
# (the right-hand count is renamed to count_oa beforehand).
data <- read_corpus(file.path("data", "corpus_tca")) |>
dplyr::select(publication_year) |>
dplyr::arrange(publication_year) |>
dplyr::collect() |>
table() |>
as.data.frame() |>
mutate(
publication_year = as.integer(as.character(publication_year)),
p = Freq / sum(Freq),
p_cum = cumsum(p)
) |>
rename(
count = Freq
) |>
dplyr::inner_join(
# Whole-OpenAlex counts grouped by publication year:
# "key" holds the year, "count" the number of works.
y = openalexR::oa_fetch(
group_by = "publication_year",
output = "tibble",
verbose = FALSE
) |>
dplyr::select(
key,
count
) |>
dplyr::rename(
publication_year = key,
count_oa = count
) |>
dplyr::arrange(publication_year) |>
dplyr::mutate(
publication_year = as.integer(as.character(publication_year)),
p_oa = count_oa / sum(count_oa),
p_oa_cum = cumsum(p_oa)
)
)Warning: Invalid metadata$r
# Publications over time: bars = per-year proportion of the TCA
# sub-corpus; red line = cumulative proportion of the sub-corpus;
# blue line = cumulative proportion of the whole OpenAlex corpus.
# The cumulative lines are plotted at p_cum / 5 and read off a
# secondary axis that multiplies by 5 again.
figure <- data |>
dplyr::filter(publication_year >= 1900) |>
ggplot() +
geom_bar(aes(x = publication_year, y = p), stat = "identity") +
geom_line(aes(x = publication_year, y = p_cum / 5), color = "red") +
geom_line(aes(x = publication_year, y = p_oa_cum / 5), color = "blue") +
scale_x_continuous(breaks = seq(1900, 2020, 10)) +
scale_y_continuous(
"Proportion of publications",
sec.axis = sec_axis(~ . * 5, name = "Cumulative proportion") # multiply by 5 to undo the /5 scaling of the cumulative lines
) +
labs(
title = "Publications over time",
x = "Year",
y = "Number of publications"
) +
theme_minimal() +
theme(axis.text.y.right = element_text(color = "red"))
# Save the figure in vector (PDF) and raster (PNG) form.
# NOTE(review): `figure` is passed positionally after the named
# arguments and is therefore matched to `plot`; `plot = figure` as the
# second argument would be clearer.
ggplot2::ggsave(
file.path("figures", "publications_over_time.pdf"),
width = 12,
height = 6,
figure
)
ggplot2::ggsave(
file.path("figures", "publications_over_time.png"),
width = 12,
height = 6,
figure
)
rm(figure)To download high resolution, click here
# Interactive table of the per-year data via the project helper
# IPBES.R::table_dt(), then the data object is dropped.
# NOTE(review): the fn label "publications_per_country" does not match
# the per-year content of `data` - confirm the intended file name.
# The trailing "@report{krug," is the start of the report's BibTeX
# entry, fused onto this line by the text extraction.
data |> IPBES.R::table_dt(fn = "publications_per_country")rm(data)@report{krug,
author = {Krug, Rainer M.},
title = {Report {Assessment} {Ch5} {Subsidies} {Reform}},
doi = {XXXXXX},
langid = {en},
abstract = {A short description of what this is about. This is not a
traditional abstract, but rather something else ...}
}